# All the libraries we need
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from mlxtend.plotting import plot_decision_regions
# Load the data: train and test
train=pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')
train.head()
# Normality tests for Age and Fare
# For samples larger than 50: Kolmogorov-Smirnov, with the population parameters assumed known.
# kstest compares against N(0,1) by default and chokes on NaNs, so we drop the missing Age values
# and pass the sample mean and standard deviation explicitly.
age = train['Age'].dropna()
stats.kstest(age, 'norm', args=(age.mean(), age.std()))
# Population mean unknown: Lilliefors
from statsmodels.stats.diagnostic import lilliefors
lilliefors(train['Fare'], dist='norm', pvalmethod='table')
# Shapiro-Wilk on both variables
stats.shapiro(train['Age'].dropna())
stats.shapiro(train['Fare'])
# Homogeneity of variance between Age and Fare
# Bartlett (drop the missing Age values, otherwise the statistic is NaN)
stats.bartlett(train['Age'].dropna(), train['Fare'])
# Visual analysis: quantile-quantile plots
from statsmodels.graphics.gofplots import qqplot
qqplot(train['Age'], line='s')
plt.show()
qqplot(train['Fare'], line='s')
plt.show()
train.shape
test.shape
ax = sns.boxplot(x="Age", data=train)
plt.show()
ax = sns.boxplot(x="Fare", data=train)
plt.show()
sns.set(color_codes=True)
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.distplot(train['Fare'])
# Histogram of age (we will discretize it later)
sns.set(color_codes=True)
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.distplot(train['Age'])
# Requires downgrading the visions package to 0.4.1
ProfileReport(train)
mitja=train['Fare'].groupby([train['Pclass']]).mean()
suma=train['Fare'].groupby([train['Pclass']]).sum()
numero=train['Fare'].groupby([train['Pclass']]).count()
despesa = pd.concat([numero.rename('numero'), suma.rename('despesaTotal'), mitja.rename('mitja')], axis=1, sort=False)
despesa
despesa['cum_sumNum'] = despesa['numero'].cumsum()
despesa['cum_sumTot'] = despesa['despesaTotal'].cumsum()
despesa['percNum'] = 100*despesa['cum_sumNum']/despesa['numero'].sum()
despesa['percTot'] = 100*despesa['cum_sumTot']/despesa['despesaTotal'].sum()
despesa
from yellowbrick.regressor import CooksDistance
from yellowbrick.datasets import load_concrete
X=np.asarray(train['Fare']).reshape(-1,1)
y=np.asarray(train['Survived'])
visualizer = CooksDistance()
visualizer.fit(X,y)
visualizer.show()
edat = train.dropna(subset=['Age']) # drop only rows where Age is missing; a bare dropna() would lose most rows through Cabin
X=np.asarray(edat['Age']).reshape(-1,1)
y=np.asarray(edat['Survived'])
visualizer = CooksDistance()
visualizer.fit(X,y)
visualizer.show()
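# For reference, a minimal sketch of the same diagnostic without yellowbrick, using
# statsmodels' OLS influence measures (assumes the X and y defined just above):
import statsmodels.api as sm
ols_fit = sm.OLS(y, sm.add_constant(X)).fit()
cooks_d, cooks_p = ols_fit.get_influence().cooks_distance
plt.stem(cooks_d)
plt.ylabel("Cook's distance")
plt.show()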
# Evaluate some variables with a logistic regression: Pclass, Sex, Fare, Embarked
reglog = train.drop(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket','Cabin'], axis=1) # build the working dataset
reglog["Embarked"].fillna("S", inplace = True) # Embarked has 2 NAs; impute the majority class so the variable can be evaluated
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
reglog["Sex"] = enc.fit_transform(reglog["Sex"]) #Codifiquem 1,0 sexe
reglog["Embarked"] = enc.fit_transform(reglog["Embarked"]) #Codifiquem 0,1,2 on embarquen.
X_log = reglog.drop(['Survived'], axis=1)
y_log = reglog['Survived']
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
reglog_model = LogisticRegression(solver='liblinear', random_state=0)
reglog_model.fit(X_log, y_log)
print(classification_report(y_log, reglog_model.predict(X_log)))
importance = reglog_model.coef_[0]
var_imp = pd.DataFrame(zip(importance, X_log.columns.values), columns=['coef', 'variable'])
sns.set(style='whitegrid')
ax = sns.barplot(x='variable', y='coef', data=var_imp)
# Discretize Fare with qcut into 5 quantile bins: each bin holds roughly the same number of passengers.
reglog['Fare']=train['Fare']
reglog['Fare_d']=pd.qcut(reglog['Fare'], 5, labels=range(5))
pd.crosstab(reglog.Fare_d, reglog.Survived, margins=True, normalize='columns')
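# For contrast, pd.cut would split Fare into equal-width intervals instead of
# equal-frequency ones. A minimal sketch ('Fare_w' is illustrative and dropped right away):
reglog['Fare_w'] = pd.cut(reglog['Fare'], 5, labels=range(5))
print(reglog['Fare_w'].value_counts()) # very unequal counts, because Fare is heavily skewed
reglog = reglog.drop(['Fare_w'], axis=1)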
# Re-evaluate the model, now substituting Fare_d for Fare
reglog['Fare']=train['Fare']
reglog['Fare_d']=pd.qcut(reglog['Fare'], 5, labels=range(5))
reglog=reglog.drop(['Fare'], axis=1)
X_log = reglog.drop(['Survived'], axis=1)
y_log = reglog['Survived']
reglog_model = LogisticRegression(solver='liblinear', random_state=0)
reglog_model.fit(X_log, y_log)
print(classification_report(y_log, reglog_model.predict(X_log)))
importance = reglog_model.coef_[0]
var_imp = pd.DataFrame(zip(importance, X_log.columns.values), columns=['coef', 'variable'])
sns.set(style='whitegrid')
ax = sns.barplot(x='variable', y='coef', data=var_imp)
# Missing values: work on imputing the empty Age entries.
train.isna().sum()
# How does Fare relate to class? Could that help us handle Cabin's NAs?
grup=train['Fare'].groupby([train['Pclass']])
grupmedia=grup.mean()
print(pd.DataFrame(grupmedia))
train['Fare'].corr(train['Pclass'])
# We now have 177 passengers with missing Age out of 891
# Could we infer age from class?
train['Age'].groupby([train['Pclass']]).mean()
# Start with the contingency tables, the obvious ones first: Sex and Survived
pd.crosstab(train.Sex, train.Survived, margins=True)
# Add class
taula_s = pd.crosstab([train.Sex, train.Survived], train.Pclass, margins=True, normalize='columns')
taula_s
plt.rcParams['figure.figsize'] = (10, 5)
sns.heatmap(taula_s, annot=True, cmap="YlGnBu")
# Let's extract two new features. The names tell us each passenger's title (married or not),
# and we can also tell who travels as a family.
# 1 - Mr (married man), 2 - Mrs (married woman), 3 - Miss (single woman), 4 - Master (title used for boys)
train['state'] = train['Name'].apply(lambda x: 'Mr' if 'Mr.' in x else
                                     'Mr' if 'Don' in x else
                                     'Mr' if 'Major' in x else
                                     'Mrs' if 'Mrs.' in x else
                                     'Mrs' if 'Mrs' in x else
                                     'Mrs' if 'Countess' in x else
                                     'Mr' if 'Capt' in x else
                                     'Mr' if 'Jonkheer' in x else
                                     'Mr' if 'Rev' in x else
                                     'Mr' if 'Col' in x else
                                     'Mrs' if 'Mme' in x else
                                     'Miss' if 'Miss.' in x else
                                     'Miss' if 'Mlle' in x else
                                     'Miss' if 'Ms' in x else
                                     'Master' if 'Master' in x else
                                     'Mr' if 'Dr.' in x else 'Mr')
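# An alternative sketch: pull the title straight out of the name with a regex,
# since it always sits between the comma and the period (e.g. "Braund, Mr. Owen Harris").
# 'title' is only illustrative; we keep the rule-based 'state' above.
title = train['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)
print(title.value_counts())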
# Identical tickets belong to the same family, so we can count how many relatives travel together: SibSp + Parch + 1
train['family'] = train['SibSp']+ train['Parch']+1
train.head()
# Use state instead of Sex
pd.crosstab([ train.state], train.Survived, margins=True)
# Next step: discretize age into bands (watch out for missing data)
# We see there is no correlation between age and fare
train['Age'].corr(train['Fare'])
perc=[0.15,0.3,0.45,0.6,0.75,0.9]
train['Age'].quantile(perc)
# Discretizing replaces each value with a bin index 0-6 according to its percentile; we create 7 bands, as seen above.
# We do the same with Fare
train['edad']=pd.qcut(train['Age'], 7, labels=range(7))
train['Fare_d']=pd.qcut(train['Fare'], 5, labels=range(5))
train.head()
# Evaluate age/state/family (number of family members). Many of them shared a cabin.
pd.crosstab([ train.edad, train.state, train.Survived], train.family, margins=True)
# At this point we can impute the missing Age values, using the mean of the group each passenger belongs to.
# First compute the mean age of the groups we create
train.groupby(['Pclass', 'state'])[['Age']].describe()
# Plan: copy 'Age', set the NaNs to 0, evaluate the group characteristics, and impute an age per group.
train['Age2']=train['Age']
train['Age2'].fillna(0,inplace=True)
Edad = train[train['Age2']== 0]
# Due to Python issues with replace/iloc/where (TypeError: unsupported operand type(s) for &: 'str' and 'Categorical'), I create 3 dataframes
Edad1 = Edad[Edad['Pclass'] == 1].copy()
Edad2 = Edad[Edad['Pclass'] == 2].copy()
Edad3 = Edad[Edad['Pclass'] == 3].copy()
# Distribution of the 177 missing values
pd.crosstab([ Edad.Pclass, Edad.state], Edad.Survived, margins=True)
# Now do the replacement with the mean values of the group each passenger belongs to.
Edad1.loc[Edad1.state == 'Miss', 'Age'] = 30
Edad1.loc[Edad1.state == 'Mr', 'Age'] = 42
Edad1.loc[Edad1.state == 'Mrs', 'Age'] = 40
Edad2.loc[Edad2.state == 'Miss', 'Age'] = 23
Edad2.loc[Edad2.state == 'Mr', 'Age'] = 33
Edad3.loc[Edad3.state == 'Master', 'Age'] = 5
Edad3.loc[Edad3.state == 'Miss', 'Age'] = 16
Edad3.loc[Edad3.state == 'Mr', 'Age'] = 29
Edad3.loc[Edad3.state == 'Mrs', 'Age'] = 33
# Impute the computed values
index1=Edad1.index.values
index2=Edad2.index.values
index3=Edad3.index.values
# Incorporate the imputed data back into the train dataset (use .loc to avoid chained assignment)
for i in index1:
    train.loc[i, 'Age'] = Edad1.loc[i, 'Age']
for i in index2:
    train.loc[i, 'Age'] = Edad2.loc[i, 'Age']
for i in index3:
    train.loc[i, 'Age'] = Edad3.loc[i, 'Age']
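# A more idiomatic alternative sketch: fill the Age NaNs in one pass with the per-group
# mean via groupby/transform. At this point Age is already imputed, so the line below is
# a no-op; it is the one-liner that could replace the three loops above.
train['Age'] = train['Age'].fillna(train.groupby(['Pclass', 'state'])['Age'].transform('mean'))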
# Evaluate age with the regression model
reglog['Age']=train['Age']
reglog['edad']=pd.qcut(reglog['Age'], 7, labels=range(7))
reglog=reglog.drop(['Age'], axis=1)
X_log = reglog.drop(['Survived'], axis=1)
y_log = reglog['Survived']
reglog_model = LogisticRegression(solver='liblinear', random_state=0)
reglog_model.fit(X_log, y_log)
print(classification_report(y_log, reglog_model.predict(X_log)))
importance = reglog_model.coef_[0]
var_imp = pd.DataFrame(zip(importance, X_log.columns.values), columns=['coef', 'variable'])
sns.set(style='whitegrid')
ax = sns.barplot(x='variable', y='coef', data=var_imp)
# Re-discretize age, now that Age has no missing values
train['edad']=pd.qcut(train['Age'], 6, labels=range(0,6))
train.head()
# Check the NAs. We have 2 in Embarked and most of Cabin.
# Fare is strongly related to Pclass, and Ticket already indicates who travels together (as does the family count)
train.isna().sum()
# The only remaining missing values are in 'Cabin': there are a lot of them (687) and it adds no information Ticket doesn't already carry.
# We can now prepare the dataset for training.
# Apply to test the same modifications we made to train.
# Create the 'state' variable
test['state'] = test['Name'].apply(lambda x: 'Mr' if 'Mr.' in x else
                                   'Mr' if 'Don' in x else
                                   'Mr' if 'Major' in x else
                                   'Mrs' if 'Mrs.' in x else
                                   'Mrs' if 'Mrs' in x else
                                   'Mrs' if 'Countess' in x else
                                   'Mr' if 'Capt' in x else
                                   'Mr' if 'Jonkheer' in x else
                                   'Mr' if 'Rev' in x else
                                   'Mr' if 'Col' in x else
                                   'Mrs' if 'Mme' in x else
                                   'Miss' if 'Miss.' in x else
                                   'Miss' if 'Mlle' in x else
                                   'Miss' if 'Ms' in x else
                                   'Master' if 'Master' in x else
                                   'Mr' if 'Dr.' in x else 'Mr')
# Add 'family'
test['family'] = test['SibSp']+ test['Parch']+1
# Modify Age
# Same plan: copy 'Age', set the NaNs to 0, and impute an age per group.
test['Age2']=test['Age']
test['Age2'].fillna(0,inplace=True)
Edad = test[test['Age2']== 0]
Edad1 = Edad[Edad['Pclass'] == 1].copy()
Edad2 = Edad[Edad['Pclass'] == 2].copy()
Edad3 = Edad[Edad['Pclass'] == 3].copy()
Edad1.loc[Edad1.state == 'Miss', 'Age'] = 30
Edad1.loc[Edad1.state == 'Mr', 'Age'] = 42
Edad1.loc[Edad1.state == 'Mrs', 'Age'] = 40
Edad2.loc[Edad2.state == 'Miss', 'Age'] = 23
Edad2.loc[Edad2.state == 'Mr', 'Age'] = 33
Edad3.loc[Edad3.state == 'Master', 'Age'] = 5
Edad3.loc[Edad3.state == 'Miss', 'Age'] = 16
Edad3.loc[Edad3.state == 'Mr', 'Age'] = 29
Edad3.loc[Edad3.state == 'Mrs', 'Age'] = 33
index1=Edad1.index.values
index2=Edad2.index.values
index3=Edad3.index.values
for i in index1:
    test.loc[i, 'Age'] = Edad1.loc[i, 'Age']
for i in index2:
    test.loc[i, 'Age'] = Edad2.loc[i, 'Age']
for i in index3:
    test.loc[i, 'Age'] = Edad3.loc[i, 'Age']
perc=[0.15,0.3,0.45,0.6,0.75,0.9]
test['Age'].quantile(perc)
test['edad']=pd.qcut(test['Age'], 6, labels=range(0,6))
# Columns to drop
# test also needs Fare_d (created for train earlier) before Fare is dropped
test['Fare_d'] = pd.qcut(test['Fare'], 5, labels=range(5))
col = ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Age2', 'Fare']
train = train.drop(col, axis=1)
test = test.drop(col, axis=1)
# Convert the categorical variables to "object" so we can pre-process them
col=['Pclass', 'Sex', 'edad', 'Fare_d']
train[col] = train[col].astype(str)
train['Survived']= train['Survived'].astype('category')
train.dtypes
import researchpy
# We can build a contingency table with percentages directly: per cell,
researchpy.crosstab(train['Survived'], train['Pclass'], prop= "cell")
# per row,
researchpy.crosstab(train['Survived'], train['Pclass'], prop= "row")
# or per column.
researchpy.crosstab(train['Survived'], train['Pclass'], prop= "col")
# We can also run the chi-square test of independence directly.
researchpy.crosstab(train['Survived'], train['Pclass'], prop='col', test= "chi-square")
This gives the contingency table with column proportions.
Pearson chi-square (df = 2.0) = 102.8890 is the value of the statistic.
p-value = 0.0000 reflects the significance of the independence test: p < 0.05 means the variables are dependent.
Cramer's V = 0.3398: values above 0.5 indicate a strong association; between 0.3 and 0.5, a moderate one.
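As a cross-check, Cramer's V can be computed by hand from scipy's chi-square statistic as V = sqrt(chi2 / (n * (k - 1))), where n is the sample size and k the smaller dimension of the table. A minimal sketch:
from scipy.stats import chi2_contingency
ct = pd.crosstab(train['Survived'], train['Pclass'])
chi2, p, dof, expected = chi2_contingency(ct)
n = ct.values.sum()
k = min(ct.shape)
print(chi2, p, np.sqrt(chi2 / (n * (k - 1))))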
# Prepare the tables
categorical_features = train.select_dtypes(include=['object']).columns
taula=[]
pvalue=[]
cramer=[]
for i in categorical_features:
    res = researchpy.crosstab(train['Survived'], train[i], prop='col', test='chi-square')
    taula.append(res[0])
    pvalue.append(res[1].iloc[1])
    cramer.append(res[1].iloc[2])
cramer=pd.DataFrame(cramer)
cramer['variables'] = categorical_features
cramer
# All the contingency tables
taula
#https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
# Now evaluate which model might work best.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Embarked has 2 NAs; we impute them with SimpleImputer
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Split the variables into numeric and categorical.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train.select_dtypes(include=['object']).columns
from sklearn.compose import ColumnTransformer
# Imputation and standardization of the numeric variables happen here
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
# Build the dataset for training
all_features = train.drop("Survived",axis=1)
Targeted_feature = train["Survived"]
# Split the train dataset into train and test, with 25% going to the test group
X_train, X_test, y_train, y_test = train_test_split(all_features,Targeted_feature,test_size=0.25,random_state=12)
# First approximation with RandomForest, using the preprocessing steps defined above
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier())]) # the model on pre-processed data
# Train
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf.score(X_test, y_test)
len(y_pred)
# GridSearch KNeighborsClassifier; does not improve on 0.77
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
kn = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', KNeighborsClassifier())])
param_grid = {
    'classifier__n_neighbors': [11, 13, 15],
    'classifier__leaf_size': [1, 2, 3],
}
from sklearn.model_selection import GridSearchCV
GSkn = GridSearchCV(kn, param_grid, n_jobs= 1)
GSkn.fit(X_train, y_train)
print(GSkn.best_params_)
print(GSkn.best_score_)
print("model score: %.3f" % GSkn.score(X_test, y_test))
#GridSearch SVC
svc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SVC())])
param_grid = {
    'classifier__gamma': [0.08, 0.1, 0.2],
    'classifier__C': [0.8, 1, 1.2, 2],
}
from sklearn.model_selection import GridSearchCV
GSsvc = GridSearchCV(svc, param_grid, n_jobs= 1)
GSsvc.fit(X_train, y_train)
print(GSsvc.best_params_)
print(GSsvc.best_score_)
print("model score: %.3f" % GSsvc.score(X_test, y_test))
# GridSearch NuSVC
nusvc = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', NuSVC())])
param_grid = {
    'classifier__decision_function_shape': ['ovo', 'ovr'],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__nu': [0.2, 0.3, 0.5],
    'classifier__kernel': ['linear', 'rbf'],
}
from sklearn.model_selection import GridSearchCV
GSnusvc = GridSearchCV(nusvc, param_grid, n_jobs= 1)
GSnusvc.fit(X_train, y_train)
print(GSnusvc.best_params_)
print(GSnusvc.best_score_)
print("model score: %.3f" % GSnusvc.score(X_test, y_test))
# GridSearch DecisionTreeClassifier
dtc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])
param_grid = {
    'classifier__max_depth': [5, 7, 10],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__min_samples_split': [3, 4, 5],
}
from sklearn.model_selection import GridSearchCV
GSdtc = GridSearchCV(dtc, param_grid, n_jobs= 1)
GSdtc.fit(X_train, y_train)
print(GSdtc.best_params_)
print(GSdtc.best_score_)
print("model score: %.3f" % GSdtc.score(X_test, y_test))
# GridSearch RandomForest; rf was already defined and fitted above
param_grid = {
    'classifier__n_estimators': [100, 200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [4, 5, 6, 7, 8],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__min_samples_split': [2, 3, 4, 5]}
from sklearn.model_selection import GridSearchCV
GSrf = GridSearchCV(rf, param_grid, n_jobs= 1)
GSrf.fit(X_train, y_train)
print(GSrf.best_params_)
print(GSrf.best_score_)
print("model score: %.3f" % GSrf.score(X_test, y_test))
# GridSearch AdaBoostClassifier
# Keep only the tuned DecisionTreeClassifier arguments (presort and min_impurity_split are deprecated)
DTC = DecisionTreeClassifier(criterion='entropy', max_depth=7,
                             max_features='sqrt', min_samples_split=4)
abc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', AdaBoostClassifier(base_estimator=DTC))])
param_grid = {
    'classifier__n_estimators': [10, 20],
    'classifier__learning_rate': [0.001, 0.005, 0.01]
}
from sklearn.model_selection import GridSearchCV
GSabc = GridSearchCV(abc, param_grid, n_jobs= 1)
GSabc.fit(X_train, y_train)
print(GSabc.best_params_)
print(GSabc.best_score_)
print("model score: %.3f" % GSabc.score(X_test, y_test))
# GridSearch GradientBoostingClassifier
gbc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier())])
param_grid = {
    'classifier__learning_rate': [0.04, 0.05, 0.06],
    'classifier__max_depth': [5, 8, 15],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__min_samples_split': [7, 8, 10],
    'classifier__n_estimators': [15, 20, 40]
}
from sklearn.model_selection import GridSearchCV
GSgbc = GridSearchCV(gbc, param_grid, n_jobs= 1)
GSgbc.fit(X_train, y_train)
print(GSgbc.best_params_)
print(GSgbc.best_score_)
print("model score: %.3f" % GSgbc.score(X_test, y_test))
# GridSearch LogisticRegression
lgr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])
param_grid = {
    'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'classifier__max_iter': [500],
    'classifier__multi_class': ['auto', 'ovr', 'multinomial'],
    'classifier__penalty': ['l2'],
    'classifier__C': [1, 1.2, 2],
}
from sklearn.model_selection import GridSearchCV
GSlgr = GridSearchCV(lgr, param_grid, n_jobs= 1)
GSlgr.fit(X_train, y_train)
print(GSlgr.best_params_)
print(GSlgr.best_score_)
print("model score: %.3f" % GSlgr.score(X_test, y_test))
# GridSearch XGBClassifier; does NOT improve on 0.807
# Name the pipeline xgb_pipe so it does not shadow the xgboost import (xgb)
xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', XGBClassifier())])
param_grid = {
    'classifier__objective': ['binary:logistic'],
    'classifier__gamma': [0, 1, 2],
    'classifier__colsample_bytree': [0.4, 0.6, 1],
    'classifier__max_depth': [4, 5, 6],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.005],
}
from sklearn.model_selection import GridSearchCV
GSxgb = GridSearchCV(xgb_pipe, param_grid, n_jobs=1)
GSxgb.fit(X_train, y_train)
print(GSxgb.best_params_)
print(GSxgb.best_score_)
print("model score: %.3f" % GSxgb.score(X_test, y_test))
# With the classifier tuning done, build the final Pipeline comparison
# Base estimator configured for AdaBoost
DTC = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split=4)
classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="rbf", C=1, probability=True, gamma=0.1),
    NuSVC(decision_function_shape='ovo', gamma='scale', kernel='rbf',
          nu=0.5, probability=True),
    DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split=4),
    RandomForestClassifier(criterion='gini', max_depth=6, max_features='auto',
                           n_estimators=200),
    AdaBoostClassifier(base_estimator=DTC, learning_rate=0.01, n_estimators=20),
    GradientBoostingClassifier(learning_rate=0.06, max_depth=8,
                               max_features='sqrt', min_samples_split=8,
                               n_estimators=40),
    LogisticRegression(C=1, max_iter=500, multi_class='multinomial', penalty='l2', solver='newton-cg'),
    XGBClassifier(max_depth=8, n_estimators=200, learning_rate=0.02, objective='binary:logistic')
]
predicted = []
Scores = []
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    predicted.append(pred)
    Scores.append(pipe.score(X_test, y_test))
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
# Summarize the results in a dataframe
Models = ['KNeighborsClassifier', 'SVC', 'NuSVC', 'DecisionTreeClassifier', 'RandomForestClassifier',
          'AdaBoostClassifier', 'GradientBoostingClassifier', 'LogisticRegression', 'XGBClassifier']
Resultats = pd.DataFrame(list(zip(Models,Scores)), columns=['Models', 'Scores'])
Resultats
# Stacking: take the best score as the meta model, LogisticRegression (0.839)
# Take predictions from models we expect to be roughly linearly independent of the LR ones
# We select NuSVC, RandomForest, AdaBoost and XGBClassifier
# The predictions were stored in the list `predicted`
# Predictions for NuSVC
pred_NuSVC = predicted[2].reshape(-1,1) # reshape to a column, then one-hot encode
pred_rf = predicted[4].reshape(-1,1)
pred_abc = predicted[5].reshape(-1,1)
pred_xgb = predicted[8].reshape(-1,1)
enc = OneHotEncoder(categories='auto') # set up the one-hot encoder
# One-hot encode each prediction
enc.fit(pred_NuSVC)
Nu_SVC_oe=enc.transform(pred_NuSVC).toarray()
enc.fit(pred_rf)
rf_oe=enc.transform(pred_rf).toarray()
enc.fit(pred_abc)
abc_oe=enc.transform(pred_abc).toarray()
enc.fit(pred_xgb)
xgb_oe=enc.transform(pred_xgb).toarray()
# Build a matrix with the predictions
meta=np.column_stack((Nu_SVC_oe,rf_oe, xgb_oe, abc_oe ))
# The 8 columns from the 4 one-hot-encoded predictions, over the 223 rows of the test split
print(meta.shape)
LRs = LogisticRegression(C=1, max_iter=500, multi_class='multinomial',
                         penalty='l2', solver='newton-cg').fit(meta, y_test)
scores_LRs=cross_val_score(LRs, meta, y_test, cv=3)
scores_LRs
y_pred = LRs.predict(meta)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))
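# For reference, scikit-learn packages this pattern as StackingClassifier, which trains the
# meta-model on out-of-fold predictions of the training split instead of on the held-out
# labels as above. A minimal sketch with abbreviated hyperparameters:
from sklearn.ensemble import StackingClassifier
stack = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', StackingClassifier(
        estimators=[('nusvc', NuSVC(probability=True)),
                    ('rf', RandomForestClassifier()),
                    ('xgb', XGBClassifier())],
        final_estimator=LogisticRegression(max_iter=500)))])
stack.fit(X_train, y_train)
print("stacking score: %.3f" % stack.score(X_test, y_test))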
# Cascading: add the X_test variables to the predictions
meta1=np.column_stack((meta,X_test))
print(meta1.shape)
# There is a single remaining missing value, in Embarked
X_test.fillna('S', inplace=True)
# One-hot encode the categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
ohe = enc.fit_transform(X_test[categorical_features]).toarray()
meta1=np.column_stack((X_test[numeric_features], ohe, meta))
LRc = LogisticRegression(C=1, max_iter=500, multi_class='multinomial',
                         penalty='l2', solver='newton-cg').fit(meta1, y_test)
scores_LRc = cross_val_score(LRc, meta1, y_test, cv=3)
scores_LRc
y_pred = LRc.predict(meta1)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))
# Finally, cascading raises the score from 0.83 to 0.85
print(classification_report(y_test, LRc.predict(meta1)))